# ------------------------------------- #
# Replication code for:
#
# Rathbun, Brian C. and Caleb Pomeroy "See No Evil, Speak No Evil? Morality, Evolutionary Psychology, 
# and the Nature of International Relations," International Organization.
#
# This script reproduces the main and supplemental text analysis results.
# See Kozlowski et al (2019) https://doi.org/10.1177/0003122419877135 for more on the 
# embedding projections procedure.
# ------------------------------------- #


# --- set your working directory --- #
setwd("~/Downloads/seeing_evil_replication/")

# --- libraries --- #
library(ggplot2)
library(ggrepel)
library(dplyr)
# NB: be sure that the packages "tm", "lsa", and "quanteda" are installed as well; they are called inline via `pkg::function()` where needed.


# --- seed dictionaries for the embedding projections --- #
# Poles of the threat dimension: threat terms vs. non-threat terms.
threat_words <- c("threat", "enemy", "adversary", "danger")
nonthreat_words <- c("security", "ally", "alliance", "safe")

# Poles of the harm dimension: harm terms vs. non-harm terms.
harm_words <- c(
  "harm", "damage", "violent", "suffer",
  "kill", "attack", "hurt", "destroy"
)
nonharm_words <- c(
  "help", "benefit", "aid",
  "protect", "safe", "safeguard"
)

# Moral and immoral terms that get projected onto the two dimensions.
moral_words <- c(
  "honest", "sincere", "giving", "trustworthy", "forgiving", "fair",
  "caring", "selfless", "compassionate", "empathetic", "loyal", "helpful",
  "respectful", "principled", "justice", "responsible", "humble", "grateful",
  "truthful", "ethical", "upright", "respectable", "equitable", "virtuous",
  "faithful", "incorruptible", "decent", "considerate", "unselfish", "humane"
)
# NB: "ungrateful" and "unkind" are each listed twice below. The list is only
# ever used for `%in%` membership checks in this script, so the repeats are inert.
immoral_words <- c(
  "dishonest", "unprincipled", "insincere", "unjust", "greedy", "unfair",
  "untrustworthy", "uncaring", "selfish", "unforgiving", "disloyal", "arrogant",
  "disrespectful", "ungrateful", "rude", "inhumane", "unkind", "inconsiderate",
  "indecent", "corruptible", "unfaithful", "unvirtuous", "unrighteous",
  "inequitable", "unethical", "untruthful", "ungrateful", "irresponsible",
  "cruel", "hateful", "merciless", "unkind"
)

# The political corpora are stemmed, so the dictionaries are stemmed to match.
# A handful of moral terms are exempted and kept at full length, because their
# stems lose the moral meaning (e.g. "humane" stems to "human" and
# "considerate" to "consider").
keep_full <- c(
  "faithful", "justice", "caring", "humane", "giving",
  "considerate", "helpful", "grateful", "truthful"
)

# Dimension poles: every entry is stemmed.
threat_words_stemmed <- tm::stemDocument(threat_words)
nonthreat_words_stemmed <- tm::stemDocument(nonthreat_words)
harm_words_stemmed <- tm::stemDocument(harm_words)
nonharm_words_stemmed <- tm::stemDocument(nonharm_words)

# Moral terms: stem everything outside `keep_full`, then append `keep_full`
# unstemmed. Immoral terms are all stemmed.
moral_words_stemmed <- c(
  tm::stemDocument(moral_words[!(moral_words %in% keep_full)]),
  keep_full
)
immoral_words_stemmed <- tm::stemDocument(immoral_words)


# --- UNGA projections --- #
# The "data/word_vectors/" folder holds the locally-trained political embeddings.
# The corpora were resampled and the embeddings estimated 20 distinct times, which
# increases the robustness of the mean estimates and allows us to produce CIs.
file_list <- list.files(path = "data/word_vectors/")
file_list <- file_list[grepl("unga", file_list, fixed = TRUE)] # UNGA embeddings only
if (length(file_list) == 0) {
  # fail early with a clear message instead of looping over c(1, 0)
  stop("no UNGA embedding files found in data/word_vectors/", call. = FALSE)
}

# For each resampled embedding set we (1) load the vectors, (2) build the threat
# and harm dimensions as the difference between the mean vectors of the two
# poles, and (3) "project" each (im)moral term onto a dimension by taking the
# cosine similarity between the term's vector and that dimension.
# Output: `results_df`, one row per term x dimension x embedding file, with
# columns term, cos_sim, side, correct, embeddings, type.
results_by_file <- vector("list", length(file_list))
for (k in seq_along(file_list)) {

  word_vectors <- readRDS(paste0("data/word_vectors/", file_list[k]))
  unga_embeddings <- t(word_vectors) # terms are matched against column names below
  vocab <- colnames(unga_embeddings)

  # --- threat dimension: mean(threat terms) - mean(non-threat terms)
  # `drop = FALSE` keeps a matrix even when only one term is found, so
  # rowMeans() covers the single-term case as well.
  threat_embeddings <- rowMeans(
    unga_embeddings[, vocab %in% threat_words_stemmed, drop = FALSE]
  )
  nonthreat_embeddings <- rowMeans(
    unga_embeddings[, vocab %in% nonthreat_words_stemmed, drop = FALSE]
  )
  threat_nonthreat_d <- threat_embeddings - nonthreat_embeddings
  threat_nonthreat_d_political <- threat_nonthreat_d

  # --- harm dimension: mean(harm terms) - mean(non-harm terms)
  harm_embeddings <- rowMeans(
    unga_embeddings[, vocab %in% harm_words_stemmed, drop = FALSE]
  )
  nonharm_embeddings <- rowMeans(
    unga_embeddings[, vocab %in% nonharm_words_stemmed, drop = FALSE]
  )
  harm_nonharm_d <- harm_embeddings - nonharm_embeddings
  harm_nonharm_d_political <- harm_nonharm_d

  # --- moral and immoral term vectors (moral columns first, then immoral)
  # `drop = FALSE` preserves the column name when a single term is matched.
  moral_embeddings <- unga_embeddings[, vocab %in% moral_words_stemmed, drop = FALSE]
  immoral_embeddings <- unga_embeddings[, vocab %in% immoral_words_stemmed, drop = FALSE]
  moral_embeddings <- cbind(moral_embeddings, immoral_embeddings)

  # --- cosine similarity of every term with each dimension
  # lsa::cosine() returns a 1x1 matrix for two vectors; as.numeric() unwraps it.
  term_idx <- seq_len(ncol(moral_embeddings))
  results_harm_df <- data.frame(
    term = colnames(moral_embeddings),
    cos_sim = vapply(
      term_idx,
      function(i) as.numeric(lsa::cosine(moral_embeddings[, i], harm_nonharm_d)),
      numeric(1)
    )
  )
  results_threat_df <- data.frame(
    term = colnames(moral_embeddings),
    cos_sim = vapply(
      term_idx,
      function(i) as.numeric(lsa::cosine(moral_embeddings[, i], threat_nonthreat_d)),
      numeric(1)
    )
  )

  # --- classifying on the "correct" side of the dimensions?
  # immoral terms are expected on the positive (harm/threat) side,
  # moral terms on the negative side.
  results_harm_df$side <- ifelse(results_harm_df$term %in% colnames(immoral_embeddings), "positive", "negative")
  results_harm_df$correct <- ifelse(results_harm_df$side == "positive" & results_harm_df$cos_sim > 0, "true",
                                    ifelse(results_harm_df$side == "negative" & results_harm_df$cos_sim < 0, "true", "false"))

  results_threat_df$side <- ifelse(results_threat_df$term %in% colnames(immoral_embeddings), "positive", "negative")
  results_threat_df$correct <- ifelse(results_threat_df$side == "positive" & results_threat_df$cos_sim > 0, "true",
                                      ifelse(results_threat_df$side == "negative" & results_threat_df$cos_sim < 0, "true", "false"))

  results_harm_df$embeddings <- file_list[k]
  results_harm_df$type <- "harm"
  results_threat_df$embeddings <- file_list[k]
  results_threat_df$type <- "threat"

  # harm rows first, then threat rows, for this embedding file
  results_by_file[[k]] <- rbind(results_harm_df, results_threat_df)
}

# stack the per-file results once, rather than growing a data frame in the loop
results_df <- do.call(rbind, results_by_file)


# Nonparametric CI per term on the harm dimension, as in Kozlowski et al (2019):
# with 20 resampled embeddings, the 2nd smallest and the 2nd largest (i.e. the
# 19th smallest) cosine estimates bound the interval. `ub` is NA when a term has
# fewer than 19 estimates, i.e. it is missing from some resampled embeddings.
unique_terms <- as.character(unique(results_df$term))
ci_rows <- lapply(unique_terms, function(term_i) {
  cos_sims <- sort(
    results_df$cos_sim[results_df$term == term_i & results_df$type == "harm"]
  )
  data.frame(
    term = term_i,
    lb = cos_sims[2],
    ub = cos_sims[19],
    cos_mean = mean(cos_sims),
    type = "harm",
    embeddings = "UNGA"
  )
})
ci_results_df <- do.call(rbind, ci_rows) # one row per term, in `unique_terms` order

# Per-term summary kept for the later cross-corpus comparison:
#   side    - expected side ("positive" for immoral terms, else "negative")
#   correct - does the mean cosine fall on the expected side?
#   ci_sig  - does the CI exclude zero (both bounds share a sign)?
unga_harm_means <- ci_results_df
unga_harm_means$side <- ifelse(unga_harm_means$term %in% immoral_words_stemmed, "positive", "negative")
unga_harm_means$correct <- ifelse(unga_harm_means$side == "positive" & unga_harm_means$cos_mean > 0, "true",
                                  ifelse(unga_harm_means$side == "negative" & unga_harm_means$cos_mean < 0, "true", "false"))
unga_harm_means$ci_sig <- ifelse((ci_results_df$lb < 0 & ci_results_df$ub < 0) |
                                   (ci_results_df$lb > 0 & ci_results_df$ub > 0), "true", "false")

# sort by mean cosine for plotting
ci_results_df <- ci_results_df[order(ci_results_df$cos_mean), ]

# --- Figure A3, harm dimension --- #
# Dot-and-whisker plot of the UNGA terms: mean cosine (point) with its
# nonparametric CI (horizontal bar), terms ordered by mean cosine.
ggplot(
  data = ci_results_df,
  mapping = aes(x = cos_mean, y = reorder(term, cos_mean))
) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") +
  geom_errorbar(
    mapping = aes(xmin = lb, xmax = ub, width = 0),
    size = 1,
    color = "#55626e"
  ) +
  geom_point(size = 2) + # slightly larger default-colour point underneath, acts as an outline
  geom_point(size = 1.8, color = "#40939e") +
  theme_bw() +
  labs(x = "Harm Dimension", y = NULL) +
  theme(
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title.x = element_text(size = 20, margin = margin(t = 10))
  )
#ggsave("harm_unga_cis_rr.pdf", width = 5, height = 12)


# Nonparametric CI per term on the threat dimension: with 20 resampled
# embeddings, the 2nd smallest and the 2nd largest (i.e. the 19th smallest)
# cosine estimates bound the interval. `ub` is NA when a term has fewer than
# 19 estimates, i.e. it is missing from some resampled embeddings.
unique_terms <- as.character(unique(results_df$term))
ci_rows <- lapply(unique_terms, function(term_i) {
  cos_sims <- sort(
    results_df$cos_sim[results_df$term == term_i & results_df$type == "threat"]
  )
  data.frame(
    term = term_i,
    lb = cos_sims[2],
    ub = cos_sims[19],
    cos_mean = mean(cos_sims),
    type = "threat",
    embeddings = "UNGA"
  )
})
ci_results_df <- do.call(rbind, ci_rows) # one row per term, in `unique_terms` order

# Per-term summary kept for the later cross-corpus comparison:
#   side    - expected side ("positive" for immoral terms, else "negative")
#   correct - does the mean cosine fall on the expected side?
#   ci_sig  - does the CI exclude zero (both bounds share a sign)?
unga_threat_means <- ci_results_df
unga_threat_means$side <- ifelse(unga_threat_means$term %in% immoral_words_stemmed, "positive", "negative")
unga_threat_means$correct <- ifelse(unga_threat_means$side == "positive" & unga_threat_means$cos_mean > 0, "true",
                                    ifelse(unga_threat_means$side == "negative" & unga_threat_means$cos_mean < 0, "true", "false"))
unga_threat_means$ci_sig <- ifelse((ci_results_df$lb < 0 & ci_results_df$ub < 0) |
                                     (ci_results_df$lb > 0 & ci_results_df$ub > 0), "true", "false")

# sort by mean cosine for plotting
ci_results_df <- ci_results_df[order(ci_results_df$cos_mean), ]

# --- Figure A3, threat dimension --- #
# Dot-and-whisker plot of the UNGA terms: mean cosine (point) with its
# nonparametric CI (horizontal bar), terms ordered by mean cosine.
ggplot(
  data = ci_results_df,
  mapping = aes(x = cos_mean, y = reorder(term, cos_mean))
) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") +
  geom_errorbar(
    mapping = aes(xmin = lb, xmax = ub, width = 0),
    size = 1,
    color = "#55626e"
  ) +
  geom_point(size = 2) + # slightly larger default-colour point underneath, acts as an outline
  geom_point(size = 1.8, color = "#40939e") +
  theme_bw() +
  labs(x = "Threat Dimension", y = NULL) +
  theme(
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title.x = element_text(size = 20, margin = margin(t = 10))
  )
#ggsave("threat_unga_cis.pdf", width = 5, height = 12)




# --- FRUS projections --- #
# Same procedure as for the UNGA embeddings, now for the FRUS corpus.
file_list <- list.files(path = "data/word_vectors/")
file_list <- file_list[grepl("fruscombined", file_list, fixed = TRUE)]
if (length(file_list) == 0) {
  # fail early with a clear message instead of looping over c(1, 0)
  stop("no FRUS embedding files found in data/word_vectors/", call. = FALSE)
}

# For each resampled embedding set we (1) load the vectors, (2) build the threat
# and harm dimensions as the difference between the mean vectors of the two
# poles, and (3) "project" each (im)moral term onto a dimension by taking the
# cosine similarity between the term's vector and that dimension.
# Output: `results_df`, one row per term x dimension x embedding file, with
# columns term, cos_sim, side, correct, embeddings, type.
results_by_file <- vector("list", length(file_list))
for (k in seq_along(file_list)) {

  word_vectors <- readRDS(paste0("data/word_vectors/", file_list[k]))
  frus_embeddings <- t(word_vectors) # terms are matched against column names below
  vocab <- colnames(frus_embeddings)

  # --- threat dimension: mean(threat terms) - mean(non-threat terms)
  # `drop = FALSE` keeps a matrix even when only one term is found, so
  # rowMeans() covers the single-term case as well.
  threat_embeddings <- rowMeans(
    frus_embeddings[, vocab %in% threat_words_stemmed, drop = FALSE]
  )
  nonthreat_embeddings <- rowMeans(
    frus_embeddings[, vocab %in% nonthreat_words_stemmed, drop = FALSE]
  )
  threat_nonthreat_d <- threat_embeddings - nonthreat_embeddings
  threat_nonthreat_d_political <- threat_nonthreat_d

  # --- harm dimension: mean(harm terms) - mean(non-harm terms)
  harm_embeddings <- rowMeans(
    frus_embeddings[, vocab %in% harm_words_stemmed, drop = FALSE]
  )
  nonharm_embeddings <- rowMeans(
    frus_embeddings[, vocab %in% nonharm_words_stemmed, drop = FALSE]
  )
  harm_nonharm_d <- harm_embeddings - nonharm_embeddings
  harm_nonharm_d_political <- harm_nonharm_d

  # --- moral and immoral term vectors (moral columns first, then immoral)
  # `drop = FALSE` preserves the column name when a single term is matched.
  moral_embeddings <- frus_embeddings[, vocab %in% moral_words_stemmed, drop = FALSE]
  immoral_embeddings <- frus_embeddings[, vocab %in% immoral_words_stemmed, drop = FALSE]
  moral_embeddings <- cbind(moral_embeddings, immoral_embeddings)

  # --- cosine similarity of every term with each dimension
  # lsa::cosine() returns a 1x1 matrix for two vectors; as.numeric() unwraps it.
  term_idx <- seq_len(ncol(moral_embeddings))
  results_harm_df <- data.frame(
    term = colnames(moral_embeddings),
    cos_sim = vapply(
      term_idx,
      function(i) as.numeric(lsa::cosine(moral_embeddings[, i], harm_nonharm_d)),
      numeric(1)
    )
  )
  results_threat_df <- data.frame(
    term = colnames(moral_embeddings),
    cos_sim = vapply(
      term_idx,
      function(i) as.numeric(lsa::cosine(moral_embeddings[, i], threat_nonthreat_d)),
      numeric(1)
    )
  )

  # --- classifying on the "correct" side of the dimensions?
  # immoral terms are expected on the positive (harm/threat) side,
  # moral terms on the negative side.
  results_harm_df$side <- ifelse(results_harm_df$term %in% colnames(immoral_embeddings), "positive", "negative")
  results_harm_df$correct <- ifelse(results_harm_df$side == "positive" & results_harm_df$cos_sim > 0, "true",
                                    ifelse(results_harm_df$side == "negative" & results_harm_df$cos_sim < 0, "true", "false"))

  results_threat_df$side <- ifelse(results_threat_df$term %in% colnames(immoral_embeddings), "positive", "negative")
  results_threat_df$correct <- ifelse(results_threat_df$side == "positive" & results_threat_df$cos_sim > 0, "true",
                                      ifelse(results_threat_df$side == "negative" & results_threat_df$cos_sim < 0, "true", "false"))

  results_harm_df$embeddings <- file_list[k]
  results_harm_df$type <- "harm"
  results_threat_df$embeddings <- file_list[k]
  results_threat_df$type <- "threat"

  # harm rows first, then threat rows, for this embedding file
  results_by_file[[k]] <- rbind(results_harm_df, results_threat_df)
}

# stack the per-file results once, rather than growing a data frame in the loop
results_df <- do.call(rbind, results_by_file)

# CIs for the FRUS terms, harm dimension: with 20 resampled embeddings, the
# 2nd smallest and the 2nd largest (i.e. the 19th smallest) cosine estimates
# bound the interval. `ub` is NA when a term has fewer than 19 estimates,
# i.e. it is missing from some resampled embeddings.
unique_terms <- as.character(unique(results_df$term))
ci_rows <- lapply(unique_terms, function(term_i) {
  cos_sims <- sort(
    results_df$cos_sim[results_df$term == term_i & results_df$type == "harm"]
  )
  data.frame(
    term = term_i,
    lb = cos_sims[2],
    ub = cos_sims[19],
    cos_mean = mean(cos_sims),
    type = "harm",
    embeddings = "FRUS"
  )
})
ci_results_df <- do.call(rbind, ci_rows) # one row per term, in `unique_terms` order

# Per-term summary kept for the later cross-corpus comparison:
#   side    - expected side ("positive" for immoral terms, else "negative")
#   correct - does the mean cosine fall on the expected side?
#   ci_sig  - does the CI exclude zero (both bounds share a sign)?
frus_harm_means <- ci_results_df
frus_harm_means$side <- ifelse(frus_harm_means$term %in% immoral_words_stemmed, "positive", "negative")
frus_harm_means$correct <- ifelse(frus_harm_means$side == "positive" & frus_harm_means$cos_mean > 0, "true",
                                  ifelse(frus_harm_means$side == "negative" & frus_harm_means$cos_mean < 0, "true", "false"))
frus_harm_means$ci_sig <- ifelse((ci_results_df$lb < 0 & ci_results_df$ub < 0) |
                                   (ci_results_df$lb > 0 & ci_results_df$ub > 0), "true", "false")

# sort by mean cosine for plotting
ci_results_df <- ci_results_df[order(ci_results_df$cos_mean), ]

# --- Figure A2, harm dimension --- #
# Dot-and-whisker plot of the FRUS terms: mean cosine (point) with its
# nonparametric CI (horizontal bar), terms ordered by mean cosine.
ggplot(
  data = ci_results_df,
  mapping = aes(x = cos_mean, y = reorder(term, cos_mean))
) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") +
  geom_errorbar(
    mapping = aes(xmin = lb, xmax = ub, width = 0),
    size = 1,
    color = "#55626e"
  ) +
  geom_point(size = 2) + # slightly larger default-colour point underneath, acts as an outline
  geom_point(size = 1.8, color = "lightpink3") +
  theme_bw() +
  labs(x = "Harm Dimension", y = NULL) +
  theme(
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title.x = element_text(size = 20, margin = margin(t = 10))
  )
#ggsave("harm_frus_cis.pdf", width = 5, height = 12)


# CIs for the FRUS terms, threat dimension: with 20 resampled embeddings, the
# 2nd smallest and the 2nd largest (i.e. the 19th smallest) cosine estimates
# bound the interval. `ub` is NA when a term has fewer than 19 estimates,
# i.e. it is missing from some resampled embeddings.
unique_terms <- as.character(unique(results_df$term))
ci_rows <- lapply(unique_terms, function(term_i) {
  cos_sims <- sort(
    results_df$cos_sim[results_df$term == term_i & results_df$type == "threat"]
  )
  data.frame(
    term = term_i,
    lb = cos_sims[2],
    ub = cos_sims[19],
    cos_mean = mean(cos_sims),
    type = "threat",
    embeddings = "FRUS"
  )
})
ci_results_df <- do.call(rbind, ci_rows) # one row per term, in `unique_terms` order

# Per-term summary kept for the later cross-corpus comparison:
#   side    - expected side ("positive" for immoral terms, else "negative")
#   correct - does the mean cosine fall on the expected side?
#   ci_sig  - does the CI exclude zero (both bounds share a sign)?
frus_threat_means <- ci_results_df
frus_threat_means$side <- ifelse(frus_threat_means$term %in% immoral_words_stemmed, "positive", "negative")
frus_threat_means$correct <- ifelse(frus_threat_means$side == "positive" & frus_threat_means$cos_mean > 0, "true",
                                    ifelse(frus_threat_means$side == "negative" & frus_threat_means$cos_mean < 0, "true", "false"))
frus_threat_means$ci_sig <- ifelse((ci_results_df$lb < 0 & ci_results_df$ub < 0) |
                                     (ci_results_df$lb > 0 & ci_results_df$ub > 0), "true", "false")

# sort by mean cosine for plotting
ci_results_df <- ci_results_df[order(ci_results_df$cos_mean), ]

# --- Figure A2, threat dimension --- #
# Dot-and-whisker plot of the FRUS terms: mean cosine (point) with its
# nonparametric CI (horizontal bar), terms ordered by mean cosine.
ggplot(
  data = ci_results_df,
  mapping = aes(x = cos_mean, y = reorder(term, cos_mean))
) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") +
  geom_errorbar(
    mapping = aes(xmin = lb, xmax = ub, width = 0),
    size = 1,
    color = "#55626e"
  ) +
  geom_point(size = 2) + # slightly larger default-colour point underneath, acts as an outline
  geom_point(size = 1.8, color = "lightpink3") +
  theme_bw() +
  labs(x = "Threat Dimension", y = NULL) +
  theme(
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title.x = element_text(size = 20, margin = margin(t = 10))
  )
#ggsave("threat_frus_cis.pdf", width = 5, height = 12)



# --- combine with GloVe --- #
# Now we do the same for the non-political corpus. We use the standard, pre-trained
# 200D GloVe vectors, available in our rep files or at https://nlp.stanford.edu/projects/glove/
# The unstemmed dictionaries are used here (the stemmed ones are only for the
# political corpora).
word_vectors <- readRDS("data/word_vectors/glove_200d_vectors.rds")
glove_embeddings <- t(word_vectors) # terms are matched against column names below
vocab <- colnames(glove_embeddings)

# --- threat dimension: mean(threat terms) - mean(non-threat terms)
# `drop = FALSE` keeps a matrix even when only one term is found, so
# rowMeans() covers the single-term case as well.
threat_embeddings <- rowMeans(
  glove_embeddings[, vocab %in% threat_words, drop = FALSE]
)
nonthreat_embeddings <- rowMeans(
  glove_embeddings[, vocab %in% nonthreat_words, drop = FALSE]
)
threat_nonthreat_d <- threat_embeddings - nonthreat_embeddings
threat_nonthreat_d_glove <- threat_nonthreat_d

# --- harm dimension: mean(harm terms) - mean(non-harm terms)
harm_embeddings <- rowMeans(
  glove_embeddings[, vocab %in% harm_words, drop = FALSE]
)
nonharm_embeddings <- rowMeans(
  glove_embeddings[, vocab %in% nonharm_words, drop = FALSE]
)
harm_nonharm_d <- harm_embeddings - nonharm_embeddings
harm_nonharm_d_glove <- harm_nonharm_d

# --- moral and immoral term vectors (moral columns first, then immoral)
# `drop = FALSE` preserves the column name when a single term is matched.
moral_embeddings <- glove_embeddings[, vocab %in% moral_words, drop = FALSE]
immoral_embeddings <- glove_embeddings[, vocab %in% immoral_words, drop = FALSE]
moral_embeddings <- cbind(moral_embeddings, immoral_embeddings)

# --- cosine similarity of every term with each dimension
# lsa::cosine() returns a 1x1 matrix for two vectors; as.numeric() unwraps it.
term_idx <- seq_len(ncol(moral_embeddings))
results_harm_df <- data.frame(
  term = colnames(moral_embeddings),
  cos_sim = vapply(
    term_idx,
    function(i) as.numeric(lsa::cosine(moral_embeddings[, i], harm_nonharm_d)),
    numeric(1)
  )
)
results_threat_df <- data.frame(
  term = colnames(moral_embeddings),
  cos_sim = vapply(
    term_idx,
    function(i) as.numeric(lsa::cosine(moral_embeddings[, i], threat_nonthreat_d)),
    numeric(1)
  )
)

# --- classifying on the "correct" side of the dimensions?
# immoral terms are expected on the positive (harm/threat) side,
# moral terms on the negative side.
results_harm_df$side <- ifelse(results_harm_df$term %in% colnames(immoral_embeddings), "positive", "negative")
results_harm_df$correct <- ifelse(results_harm_df$side == "positive" & results_harm_df$cos_sim > 0, "true",
                                  ifelse(results_harm_df$side == "negative" & results_harm_df$cos_sim < 0, "true", "false"))

results_threat_df$side <- ifelse(results_threat_df$term %in% colnames(immoral_embeddings), "positive", "negative")
results_threat_df$correct <- ifelse(results_threat_df$side == "positive" & results_threat_df$cos_sim > 0, "true",
                                    ifelse(results_threat_df$side == "negative" & results_threat_df$cos_sim < 0, "true", "false"))

results_harm_df$embeddings <- "glove"
results_harm_df$type <- "harm"
results_threat_df$embeddings <- "glove"
results_threat_df$type <- "threat"

# append the GloVe rows to whatever `results_df` currently holds
results_df <-
  rbind(results_df,
        rbind(results_harm_df, results_threat_df)
  )




# --- quotidian embeddings permutation test --- #

# --- harm
# Null distribution per term: shuffle the entries of the term's embedding
# vector 2000 times and take the cosine of each shuffled vector with the harm
# dimension. The seed and the order of sample() calls fix the draws.
set.seed(1912)
perm_results_list <- vector("list", ncol(moral_embeddings))
for (k in seq_len(ncol(moral_embeddings))) {
  moral_term_k <- moral_embeddings[, k]
  # collect the 2000 null cosines in one vector instead of rbind-ing
  # a data frame 2000 times
  null_values <- vapply(
    seq_len(2000),
    function(i) {
      perm_v <- sample(moral_term_k, size = length(moral_term_k), replace = FALSE)
      as.numeric(lsa::cosine(perm_v, harm_nonharm_d))
    },
    numeric(1)
  )
  perm_results_list[[k]] <- data.frame(
    type = "null",
    value = null_values,
    term = colnames(moral_embeddings)[k]
  )
  print(k) # progress indicator
}

perm_results_df <- bind_rows(perm_results_list) # warnings are fine, simply coercing to characters

# One-sided test per term against its null distribution:
#   moral terms   - p = share of null cosines that lie below the observed cosine
#   immoral terms - p = share of null cosines that lie above the observed cosine
# A term is flagged ("true") when p < .10.
perm_terms <- unique(perm_results_df$term)
h_test_rows <- lapply(perm_terms, function(term_i) {
  null_i <- perm_results_df$value[perm_results_df$term == term_i]
  observed_i <- results_harm_df$cos_sim[results_harm_df$term == term_i]
  p_val <- if (term_i %in% moral_words) {
    sum(observed_i > null_i) / length(null_i)
  } else {
    sum(observed_i < null_i) / length(null_i)
  }
  data.frame(
    p_val = p_val,
    term = term_i,
    test = ifelse(p_val < .10, "true", "false"),
    mean_cos = observed_i
  )
})
h_test <- do.call(rbind, h_test_rows)

h_test_harm <- h_test

# --- threat
# Null distribution per term: shuffle the entries of the term's embedding
# vector 2000 times and take the cosine of each shuffled vector with the threat
# dimension. The seed and the order of sample() calls fix the draws.
set.seed(1912)
perm_results_list <- vector("list", ncol(moral_embeddings))
for (k in seq_len(ncol(moral_embeddings))) {
  moral_term_k <- moral_embeddings[, k]
  # collect the 2000 null cosines in one vector instead of rbind-ing
  # a data frame 2000 times
  null_values <- vapply(
    seq_len(2000),
    function(i) {
      perm_v <- sample(moral_term_k, size = length(moral_term_k), replace = FALSE)
      as.numeric(lsa::cosine(perm_v, threat_nonthreat_d))
    },
    numeric(1)
  )
  perm_results_list[[k]] <- data.frame(
    type = "null",
    value = null_values,
    term = colnames(moral_embeddings)[k]
  )
  print(k) # progress indicator
}

perm_results_df <- bind_rows(perm_results_list)

# One-sided test per term against its null distribution:
#   moral terms   - p = share of null cosines that lie below the observed cosine
#   immoral terms - p = share of null cosines that lie above the observed cosine
# A term is flagged ("true") when p < .10.
perm_terms <- unique(perm_results_df$term)
h_test_rows <- lapply(perm_terms, function(term_i) {
  null_i <- perm_results_df$value[perm_results_df$term == term_i]
  observed_i <- results_threat_df$cos_sim[results_threat_df$term == term_i]
  p_val <- if (term_i %in% moral_words) {
    sum(observed_i > null_i) / length(null_i)
  } else {
    sum(observed_i < null_i) / length(null_i)
  }
  data.frame(
    p_val = p_val,
    term = term_i,
    test = ifelse(p_val < .10, "true", "false"),
    mean_cos = observed_i
  )
})
h_test <- do.call(rbind, h_test_rows)

h_test_threat <- h_test

# Share of term-by-dimension tests (harm and threat pooled) that diverge
# significantly from the permutation null; ~59.3% overall.
(sum(h_test_harm$test == "true") + sum(h_test_threat$test == "true")) /
  (nrow(h_test_harm) + nrow(h_test_threat))

# for the individual dimensions...
#sum(h_test_harm$test=="true")/nrow(h_test_harm) # for harm: 34/59=57.6%
#sum(h_test_threat$test=="true")/nrow(h_test_threat) # for threat: 36/59 = 61.0%

# Are any of these more robust terms classified incorrectly?
# "correct" = a moral term with a negative cosine or an immoral term with a
# positive cosine; anything else is "wrong".
h_test_harm_sig <- subset(h_test_harm, test == "true")
h_test_harm_sig$sig_correct <- with(
  h_test_harm_sig,
  ifelse(
    (term %in% moral_words & mean_cos < 0) |
      (term %in% immoral_words & mean_cos > 0),
    "correct", "wrong"
  )
)

h_test_threat_sig <- subset(h_test_threat, test == "true")
h_test_threat_sig$sig_correct <- with(
  h_test_threat_sig,
  ifelse(
    (term %in% moral_words & mean_cos < 0) |
      (term %in% immoral_words & mean_cos > 0),
    "correct", "wrong"
  )
)

#...no, the more robust moral terms all fall on the expected sides of the dimensions
# (share of "wrong" among the significant terms: threat first, then harm)
sum(h_test_threat_sig$sig_correct == "wrong") / nrow(h_test_threat_sig)
sum(h_test_harm_sig$sig_correct == "wrong") / nrow(h_test_harm_sig)



# --- terms from the political corpora that significantly diverge from zero...
political_corpora <- rbind(
  unga_harm_means,
  unga_threat_means,
  frus_harm_means,
  frus_threat_means
)

# ~56.0% for the FRUS. Note that NAs in the "ci_sig" column indicate that the
# term was not present in each of the 20 resampled corpora, i.e. the term
# certainly exists in the corpus but does not appear frequently enough to
# estimate a reliable confidence interval. Those NAs are dropped from both the
# numerator and the denominator.
with(
  subset(political_corpora, embeddings == "FRUS"),
  sum(na.omit(ci_sig == "true")) / length(na.omit(ci_sig == "true"))
)
# ~59.5% for the UNGA
with(
  subset(political_corpora, embeddings == "UNGA"),
  sum(na.omit(ci_sig == "true")) / length(na.omit(ci_sig == "true"))
)

# --- do we misclassify any of the more robust terms in the political corpora?...
political_corpora_sig <- subset(political_corpora, ci_sig == "true")
subset(political_corpora_sig, correct == "false")
#...yes, "virtuous" and "ethic" in the UNGA and "honest" in the FRUS

# drop the CI columns; the remaining columns are reused for the combined plot data
political_corpora <- political_corpora[
  , !(names(political_corpora) %in% c("lb", "ub", "ci_sig")),
  drop = FALSE
]



# --- combined plot for the main text --- #

# Stack the political-corpora means with the GloVe ("Quotidian") cosines,
# renaming cos_sim to cos_mean so the columns line up.
plot_df <- rbind(
  political_corpora,
  with(
    results_harm_df,
    data.frame(
      term = term,
      cos_mean = cos_sim,
      type = type,
      embeddings = "Quotidian",
      side = side,
      correct = correct
    )
  ),
  with(
    results_threat_df,
    data.frame(
      term = term,
      cos_mean = cos_sim,
      type = type,
      embeddings = "Quotidian",
      side = side,
      correct = correct
    )
  )
)

# Legend labels per corpus (trailing spaces pad the legend entries);
# any other value becomes NA.
plot_df$embeddings <- unname(
  c(
    FRUS = "FRUS (1952-77)  ",
    UNGA = "UNGA (1965-2018)  ",
    Quotidian = "Quotidian Texts  "
  )[as.character(plot_df$embeddings)]
)

# fix the legend / colour order
plot_df$embeddings <- factor(
  plot_df$embeddings,
  levels = c("FRUS (1952-77)  ", "UNGA (1965-2018)  ", "Quotidian Texts  ")
)

# position of each corpus's axis line in the combined figure
plot_df$y_pos <- unname(
  c(
    "FRUS (1952-77)  " = .8,
    "UNGA (1965-2018)  " = 1.4,
    "Quotidian Texts  " = 2
  )[as.character(plot_df$embeddings)]
)

# text labels for the points
plot_df$name_labels <- as.character(plot_df$term)


# --- Figure 1(A), harm dimension --- #
# One double-headed axis line per corpus (at y_pos .8 / 1.4 / 2), with each
# term placed along it at its mean cosine and labelled; axes are then flipped.
ggplot(subset(plot_df, type == "harm"), aes(colour = embeddings)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray60") +
  # axis lines, one per corpus
  geom_segment(
    y = .8, yend = .8, x = -.42, xend = .42,
    arrow = arrow(length = unit(0.4, "cm"), ends = "both", type = "closed"),
    size = 1.2, colour = "grey40"
  ) +
  geom_segment(
    y = 1.4, yend = 1.4, x = -.42, xend = .42,
    arrow = arrow(length = unit(0.4, "cm"), ends = "both", type = "closed"),
    size = 1.2, colour = "grey40"
  ) +
  geom_segment(
    y = 2, yend = 2, x = -.42, xend = .42,
    arrow = arrow(length = unit(0.4, "cm"), ends = "both", type = "closed"),
    size = 1.2, colour = "grey40"
  ) +
  # black underlay gives the coloured points an outline
  geom_point(
    aes(x = cos_mean, y = y_pos, group = embeddings),
    size = 5, colour = "black"
  ) +
  geom_point(
    aes(x = cos_mean, y = y_pos, group = embeddings, colour = embeddings),
    size = 4
  ) +
  geom_text_repel(
    aes(x = cos_mean, y = y_pos, group = embeddings, label = name_labels),
    colour = "black", segment.colour = "gray50", segment.size = .4,
    nudge_y = 0.15, direction = "y", hjust = 0, size = 7, force = .2
  ) +
  scale_colour_manual(
    values = c("lightpink3", "#40939e", "gray60", "gray90"),
    guide = guide_legend(reverse = FALSE)
  ) +
  theme_bw() +
  coord_flip(xlim = c(-.4, .4), ylim = c(.7, 2.45)) +
  guides(colour = guide_legend(override.aes = list(size = 10))) +
  scale_x_continuous(breaks = c(-.5, -.4, -.3, -.2, -.1, 0, .1, .2, .3)) +
  labs(x = "Harm Dimension") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 40, margin = margin(r = 24)),
    axis.text.y = element_text(size = 30),
    axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.ticks.y = element_line(color = "black", size = .2),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    legend.position = "top",
    legend.spacing.x = unit(.4, 'cm'),
    legend.box.background = element_rect(colour = "gray30"),
    legend.text = element_text(size = 30),
    plot.margin = margin(r = 160, l = 40, t = 10, b = 10)
  )
#ggsave("harm_dimensions_customstems_means.pdf", width = 16, height = 22)


# --- Figure 1(B), threat dimension --- #
# One double-headed axis line per corpus (at y_pos .8 / 1.4 / 2), with each
# term placed along it at its mean cosine and labelled; axes are then flipped.
ggplot(subset(plot_df, type == "threat"), aes(colour = embeddings)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray60") +
  # axis lines, one per corpus
  geom_segment(
    y = .8, yend = .8, x = -.42, xend = .42,
    arrow = arrow(length = unit(0.4, "cm"), ends = "both", type = "closed"),
    size = 1.2, colour = "grey40"
  ) +
  geom_segment(
    y = 1.4, yend = 1.4, x = -.42, xend = .42,
    arrow = arrow(length = unit(0.4, "cm"), ends = "both", type = "closed"),
    size = 1.2, colour = "grey40"
  ) +
  geom_segment(
    y = 2, yend = 2, x = -.42, xend = .42,
    arrow = arrow(length = unit(0.4, "cm"), ends = "both", type = "closed"),
    size = 1.2, colour = "grey40"
  ) +
  # black underlay gives the coloured points an outline
  geom_point(
    aes(x = cos_mean, y = y_pos, group = embeddings),
    size = 5, colour = "black"
  ) +
  geom_point(
    aes(x = cos_mean, y = y_pos, group = embeddings, colour = embeddings),
    size = 4
  ) +
  geom_text_repel(
    aes(x = cos_mean, y = y_pos, group = embeddings, label = name_labels),
    colour = "black", segment.colour = "gray50", segment.size = .4,
    nudge_y = 0.15, direction = "y", hjust = 0, size = 7, force = .2
  ) +
  scale_colour_manual(
    values = c("lightpink3", "#40939e", "gray60", "gray90"),
    guide = guide_legend(reverse = FALSE)
  ) +
  theme_bw() +
  coord_flip(xlim = c(-.4, .4), ylim = c(.7, 2.45)) +
  guides(colour = guide_legend(override.aes = list(size = 10))) +
  scale_x_continuous(breaks = c(-.5, -.4, -.3, -.2, -.1, 0, .1, .2, .3)) +
  labs(x = "Threat Dimension") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 40, margin = margin(r = 24)),
    axis.text.y = element_text(size = 30),
    axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.ticks.y = element_line(color = "black", size = .2),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    legend.position = "top",
    legend.spacing.x = unit(.4, 'cm'),
    legend.box.background = element_rect(colour = "gray30"),
    legend.text = element_text(size = 30),
    plot.margin = margin(r = 160, l = 40, t = 10, b = 10)
  )
#ggsave("threat_dimensions_customstems_means.pdf", width = 16, height = 22)


# --- overall classification rates
#sum(plot_df$correct=="true")/nrow(plot_df)

# Share of harm-dimension rows (all corpora pooled) whose term falls on the
# expected side; ~87.8% correct.
with(
  subset(plot_df, type == "harm"),
  sum(correct == "true", na.rm = TRUE) / length(correct)
)

# Same for the threat dimension; ~81.3% correct.
with(
  subset(plot_df, type == "threat"),
  sum(correct == "true", na.rm = TRUE) / length(correct)
)





# --- Threat vs. regime type embedding dimensions --- #

# Seed words for the two poles of the regime-type dimension
democ_words <- c(
  "democracy",
  "democratic",
  "democratically"
)
nondemoc_words <- c(
  "dictator", "dictatorship", "dictatorial",
  "autocracy", "autocratic", "autocratically"
)

# Stemmed versions of the seed words; these are what get matched against the
# vocabulary of the UNGA embeddings below
democ_words_stemmed <- tm::stemDocument(democ_words)
nondemoc_words_stemmed <- tm::stemDocument(nondemoc_words)

# --- UNGA:
# For every UNGA embedding file: build the threat and regime-type dimensions
# (mean vector of one pole minus mean vector of the other) and record the
# cosine similarity of each moral/immoral term with both dimensions.
# Result: results_regime_df with columns term, cos_sim, embeddings (file name)
# and type ("democ" or "threat").
file_list <- list.files(path = "data/word_vectors/")
file_list <- file_list[grepl("unga", file_list, fixed = TRUE)] # starting w/ UNGA embeddings
if (length(file_list) == 0) {
  # fail with a clear message instead of trying to read "data/word_vectors/NA"
  stop("no UNGA word-vector files found in data/word_vectors/", call. = FALSE)
}

# one result frame per file, bound together once after the loop
results_regime_list <- vector("list", length(file_list))
for (k in seq_along(file_list)) {

  word_vectors <- readRDS(paste0("data/word_vectors/", file_list[k]))
  unga_embeddings <- t(word_vectors) # after transposing, columns are indexed by word
  vocab <- colnames(unga_embeddings)

  # --- threat dimension
  # drop = FALSE keeps a single matching word as a one-column matrix, so
  # rowMeans() handles the one-word and many-word cases alike
  threat_embeddings <- rowMeans(unga_embeddings[, vocab %in% threat_words_stemmed, drop = FALSE])
  nonthreat_embeddings <- rowMeans(unga_embeddings[, vocab %in% nonthreat_words_stemmed, drop = FALSE])
  threat_nonthreat_d <- threat_embeddings - nonthreat_embeddings
  threat_nonthreat_d_unga <- threat_nonthreat_d

  # --- regime type dimension
  democ_embeddings <- rowMeans(unga_embeddings[, vocab %in% democ_words_stemmed, drop = FALSE])
  nondemoc_embeddings <- rowMeans(unga_embeddings[, vocab %in% nondemoc_words_stemmed, drop = FALSE])
  democ_nondemoc_d <- democ_embeddings - nondemoc_embeddings
  democ_nondemoc_d_unga <- democ_nondemoc_d

  # --- moral words
  moral_embeddings <- unga_embeddings[, vocab %in% moral_words_stemmed, drop = FALSE]

  # --- immoral words
  # drop = FALSE also preserves the column (term) names when only one stem
  # is found, so the terms below are always labelled correctly
  immoral_embeddings <- unga_embeddings[, vocab %in% immoral_words_stemmed, drop = FALSE]
  moral_embeddings <- cbind(moral_embeddings, immoral_embeddings)
  term_ids <- seq_len(ncol(moral_embeddings))

  # cosine similarity of every term with the regime-type dimension
  results_democ_df <- data.frame(
    term = colnames(moral_embeddings),
    cos_sim = vapply(term_ids, function(i) {
      as.numeric(lsa::cosine(moral_embeddings[, i], democ_nondemoc_d))
    }, numeric(1))
  )
  # cosine similarity of every term with the threat dimension
  results_threat_df <- data.frame(
    term = colnames(moral_embeddings),
    cos_sim = vapply(term_ids, function(i) {
      as.numeric(lsa::cosine(moral_embeddings[, i], threat_nonthreat_d))
    }, numeric(1))
  )

  results_democ_df$embeddings <- rep(file_list[k], nrow(results_democ_df))
  results_democ_df$type <- rep("democ", nrow(results_democ_df))
  results_threat_df$embeddings <- rep(file_list[k], nrow(results_threat_df))
  results_threat_df$type <- rep("threat", nrow(results_threat_df))

  results_regime_list[[k]] <- rbind(results_democ_df, results_threat_df)
}
results_regime_df <- do.call(rbind, results_regime_list)


unique_terms <- as.character(unique(results_regime_df$term))

# Average each term's cosine similarity over all UNGA embedding files for one
# dimension.
#   dimension: value of results_regime_df$type to summarise ("threat" or "democ")
# Returns a data frame with one row per entry of unique_terms (same order) and
# columns term, cos_mean, type, embeddings.
unga_term_means <- function(dimension) {
  cos_mean <- vapply(unique_terms, function(current_term) {
    rows <- which(results_regime_df$term == current_term &
                    results_regime_df$type == dimension)
    # sort() drops NA values before averaging; a term with no rows yields NaN
    mean(sort(results_regime_df$cos_sim[rows]))
  }, numeric(1), USE.NAMES = FALSE)

  data.frame(
    term = unique_terms,
    cos_mean = cos_mean,
    type = rep(dimension, length(unique_terms)),
    embeddings = rep("UNGA", length(unique_terms))
  )
}

unga_threat_df <- unga_term_means("threat")
unga_democ_df <- unga_term_means("democ")

# threat rows first, then regime-type rows; both blocks share the term order
unga_regime_df <- rbind(unga_threat_df, unga_democ_df)



# --- GloVe:
# Same projections as for the UNGA embeddings, but on the GloVe vectors and
# with the unstemmed word lists. Produces results_democ_df and
# results_threat_df (columns term, cos_sim, embeddings, type).
word_vectors <- readRDS("data/word_vectors/glove_200d_vectors.rds")
glove_embeddings <- t(word_vectors) # after transposing, columns are indexed by word
glove_vocab <- colnames(glove_embeddings)

# --- threat dimension
# drop = FALSE keeps a single matching word as a one-column matrix, so
# rowMeans() handles the one-word and many-word cases alike
threat_embeddings <- rowMeans(glove_embeddings[, glove_vocab %in% threat_words, drop = FALSE])
nonthreat_embeddings <- rowMeans(glove_embeddings[, glove_vocab %in% nonthreat_words, drop = FALSE])
threat_nonthreat_d <- threat_embeddings - nonthreat_embeddings
threat_nonthreat_d_glove <- threat_nonthreat_d

# --- regime type dimension
democ_embeddings <- rowMeans(glove_embeddings[, glove_vocab %in% democ_words, drop = FALSE])
nondemoc_embeddings <- rowMeans(glove_embeddings[, glove_vocab %in% nondemoc_words, drop = FALSE])
democ_nondemoc_d <- democ_embeddings - nondemoc_embeddings
democ_nondemoc_d_glove <- democ_nondemoc_d

# --- moral words
moral_embeddings <- glove_embeddings[, glove_vocab %in% moral_words, drop = FALSE]

# --- immoral words
# drop = FALSE also preserves the column (term) names when only one word is found
immoral_embeddings <- glove_embeddings[, glove_vocab %in% immoral_words, drop = FALSE]
moral_embeddings <- cbind(moral_embeddings, immoral_embeddings)
term_ids <- seq_len(ncol(moral_embeddings))

# cosine similarity of every term with the regime-type dimension
results_democ_df <- data.frame(
  term = colnames(moral_embeddings),
  cos_sim = vapply(term_ids, function(i) {
    as.numeric(lsa::cosine(moral_embeddings[, i], democ_nondemoc_d_glove))
  }, numeric(1))
)

# cosine similarity of every term with the threat dimension
results_threat_df <- data.frame(
  term = colnames(moral_embeddings),
  cos_sim = vapply(term_ids, function(i) {
    as.numeric(lsa::cosine(moral_embeddings[, i], threat_nonthreat_d_glove))
  }, numeric(1))
)

results_democ_df$embeddings <- rep("glove", nrow(results_democ_df))
results_democ_df$type <- rep("democ", nrow(results_democ_df))
results_threat_df$embeddings <- rep("glove", nrow(results_threat_df))
results_threat_df$type <- rep("threat", nrow(results_threat_df))



# --- plot it
# Assemble one row per term and text source, with its threat and democracy
# scores side by side.
unga_threat_rows <- subset(unga_regime_df, type == "threat")
unga_democ_rows <- subset(unga_regime_df, type == "democ")

# The threat and democracy scores are paired purely by row position, so stop
# if the two sets of terms are not in the same order.
stopifnot(
  identical(as.character(unga_threat_rows$term), as.character(unga_democ_rows$term)),
  identical(as.character(results_threat_df$term), as.character(results_democ_df$term))
)

plot_regime_unga <- data.frame(
  threat = unga_threat_rows$cos_mean,
  democracy = unga_democ_rows$cos_mean,
  type = "UNGA",
  term = as.character(unga_threat_rows$term)
)

plot_regime_glove <- data.frame(
  threat = results_threat_df$cos_sim,
  democracy = results_democ_df$cos_sim,
  type = "glove",
  term = as.character(results_threat_df$term)
)
plot_regime_df <- rbind(plot_regime_unga, plot_regime_glove)

# --- Figure A1: threat vs. regime type projections --- #
# Scatter of each term's democracy score (x) against its threat score (y),
# coloured by text source.
# Colours and legend labels are keyed to the values of `type` ("UNGA", "glove")
# via named values and explicit breaks. Matching them by position would depend
# on how the session's locale sorts "UNGA" relative to "glove", and could
# attach the "UNGD Speeches" label to the GloVe points.
ggplot(plot_regime_df, aes(x = democracy, y = threat, color = type)) +
  geom_hline(yintercept = 0, size = .4, color = "gray50") +
  geom_vline(xintercept = 0, size = .4, color = "gray50") +
  geom_point() +
  geom_text_repel(aes(label = term), segment.colour="gray80", size = 4.2) +
  scale_color_manual(values = c("UNGA" = "gray20", "glove" = "#40939e"),
                     breaks = c("UNGA", "glove"),
                     name = "Text Source", labels = c("\nUNGD\nSpeeches\n", "Quotidian\nHuman Texts")) + 
  theme_minimal() +
  coord_cartesian(xlim = c(-.45,.5), ylim = c(-.3,.3))+
  labs(y = "Threat", x = "Democracy") +
  guides(colour=guide_legend(override.aes=list(size=4))) +
  theme(axis.text.y=element_text(size=15, color="black"),
        axis.text.x=element_text(size=15, color="black"),
        legend.text = element_text(size=16),
        axis.title.x = element_text(size = 20, margin = margin(t = 14, b = 0, l = 0)),
        axis.title.y = element_text(size = 20, margin = margin(t = 0, r = 14, b = 0, l = 0)),
        legend.title = element_text(size = 18),
        legend.spacing.y = unit(.3, 'cm'))
#ggsave("threat_democracy_dimensions.pdf", width = 11, height = 13)




# --- FRUS analysis on uncombined corpora --- #
# here, we just show that the FRUS results are robust to analyzing the two corpora separately, as opposed to combining them
# For every "frussub" embedding file: build the threat and harm dimensions
# (mean vector of one pole minus mean vector of the other), project each
# moral/immoral term onto both, and flag whether the sign of the projection
# matches the term's side (immoral terms positive, moral terms negative).
# Result: results_df with columns term, cos_sim, side, correct, embeddings, type.
file_list <- list.files(path = "data/word_vectors/")
file_list <- file_list[grepl("frussub", file_list)]
if (length(file_list) == 0) {
  # fail with a clear message instead of trying to read "data/word_vectors/NA"
  stop("no FRUS sub-corpus word-vector files found in data/word_vectors/", call. = FALSE)
}

# one result frame per file, bound together once after the loop
results_list <- vector("list", length(file_list))
for (k in seq_along(file_list)) {

  word_vectors <- readRDS(paste0("data/word_vectors/", file_list[k]))
  frus_embeddings <- t(word_vectors) # after transposing, columns are indexed by word
  vocab <- colnames(frus_embeddings)

  # --- threat dimension
  # drop = FALSE keeps a single matching word as a one-column matrix, so
  # rowMeans() handles the one-word and many-word cases alike
  threat_embeddings <- rowMeans(frus_embeddings[, vocab %in% threat_words_stemmed, drop = FALSE])
  nonthreat_embeddings <- rowMeans(frus_embeddings[, vocab %in% nonthreat_words_stemmed, drop = FALSE])
  threat_nonthreat_d <- threat_embeddings - nonthreat_embeddings
  threat_nonthreat_d_political <- threat_nonthreat_d

  # --- harm dimension
  harm_embeddings <- rowMeans(frus_embeddings[, vocab %in% harm_words_stemmed, drop = FALSE])
  nonharm_embeddings <- rowMeans(frus_embeddings[, vocab %in% nonharm_words_stemmed, drop = FALSE])
  harm_nonharm_d <- harm_embeddings - nonharm_embeddings
  harm_nonharm_d_political <- harm_nonharm_d

  # --- moral words
  moral_embeddings <- frus_embeddings[, vocab %in% moral_words_stemmed, drop = FALSE]

  # --- immoral words
  # drop = FALSE also preserves the column (term) names when only one stem
  # is found, which the side lookup below relies on
  immoral_embeddings <- frus_embeddings[, vocab %in% immoral_words_stemmed, drop = FALSE]
  moral_embeddings <- cbind(moral_embeddings, immoral_embeddings)
  term_ids <- seq_len(ncol(moral_embeddings))

  # cosine similarity of every term with the harm dimension
  results_harm_df <- data.frame(
    term = colnames(moral_embeddings),
    cos_sim = vapply(term_ids, function(i) {
      as.numeric(lsa::cosine(moral_embeddings[, i], harm_nonharm_d))
    }, numeric(1))
  )

  # cosine similarity of every term with the threat dimension
  results_threat_df <- data.frame(
    term = colnames(moral_embeddings),
    cos_sim = vapply(term_ids, function(i) {
      as.numeric(lsa::cosine(moral_embeddings[, i], threat_nonthreat_d))
    }, numeric(1))
  )

  # immoral terms are expected on the positive side, all other terms on the
  # negative side; a projection of exactly 0 counts as "false"
  results_harm_df$side <- ifelse(results_harm_df$term %in% colnames(immoral_embeddings), "positive", "negative")
  results_harm_df$correct <- ifelse(
    (results_harm_df$side == "positive" & results_harm_df$cos_sim > 0) |
      (results_harm_df$side == "negative" & results_harm_df$cos_sim < 0),
    "true", "false"
  )

  results_threat_df$side <- ifelse(results_threat_df$term %in% colnames(immoral_embeddings), "positive", "negative")
  results_threat_df$correct <- ifelse(
    (results_threat_df$side == "positive" & results_threat_df$cos_sim > 0) |
      (results_threat_df$side == "negative" & results_threat_df$cos_sim < 0),
    "true", "false"
  )

  results_harm_df$embeddings <- rep(file_list[k], nrow(results_harm_df))
  results_harm_df$type <- rep("harm", nrow(results_harm_df))
  results_threat_df$embeddings <- rep(file_list[k], nrow(results_threat_df))
  results_threat_df$type <- rep("threat", nrow(results_threat_df))

  results_list[[k]] <- rbind(results_harm_df, results_threat_df)
}
results_df <- do.call(rbind, results_list)

# Translate embedding file names into display labels (the trailing spaces are
# part of the labels). File names not listed here become NA.
frus_label_lookup <- c(
  "frussub_soviets_200d_vectors.rds" = "FRUS (Soviets, 1952-77)  ",
  "frussub_lauretig_200d_vectors.rds" = "FRUS (Universe, 1964-66)  "
)
results_df$embeddings <- factor(
  unname(frus_label_lookup[match(results_df$embeddings, names(frus_label_lookup))]),
  levels = unname(frus_label_lookup)
)

# Vertical position of each corpus in the plots: first level at .8, second at 1.4
results_df$y_pos <- c(.8, 1.4)[as.integer(results_df$embeddings)]

# Text shown next to each point
results_df$name_labels <- as.character(results_df$term)

# --- Figure A4 harm plot --- #
# Harm-dimension projections for the two FRUS sub-corpora, one horizontal rail
# per corpus (rails at y = .8 and y = 1.4, flipped by coord_flip).
frus_harm_rows <- subset(
  results_df,
  type == "harm" &
    embeddings %in% c("FRUS (Soviets, 1952-77)  ", "FRUS (Universe, 1964-66)  ")
)

# double-headed grey arrow drawn at a fixed height
frus_rail <- function(height) {
  geom_segment(
    y = height, yend = height, x = -.42, xend = .42,
    arrow = arrow(length = unit(0.4, "cm"), ends = "both", type = "closed"),
    size = 1.2, colour = "grey40"
  )
}

ggplot(frus_harm_rows, aes(colour = embeddings)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray60") +
  frus_rail(.8) +
  frus_rail(1.4) +
  # black disc underneath the coloured one gives each point an outline
  geom_point(aes(x = cos_sim, y = y_pos, group = embeddings),
             size = 5, colour = "black") +
  geom_point(aes(x = cos_sim, y = y_pos, group = embeddings, colour = embeddings),
             size = 4) +
  geom_text_repel(
    aes(x = cos_sim, y = y_pos, group = embeddings, label = name_labels),
    colour = "black", segment.colour = "gray50", segment.size = .4,
    nudge_y = 0.15, direction = "y", hjust = 0, size = 7, force = .2
  ) +
  scale_colour_manual(
    values = c("gray60", "gray90"),
    guide = guide_legend(reverse = FALSE)
  ) +
  theme_bw() +
  coord_flip(xlim = c(-.4, .4), ylim = c(.7, 1.8)) +
  guides(colour = guide_legend(override.aes = list(size = 10))) +
  scale_x_continuous(breaks = c(-.5, -.4, -.3, -.2, -.1, 0, .1, .2, .3)) +
  labs(x = "Harm Dimension") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 25, margin = margin(r = 24)),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.ticks.y = element_line(color = "black", size = .2),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    legend.position = "top",
    legend.spacing.x = unit(.4, 'cm'),
    legend.box.background = element_rect(colour = "gray30"),
    legend.text = element_text(size = 20),
    plot.margin = margin(r = 160, l = 40, t = 10, b = 10)
  )
#ggsave("harm_dimensions_frus_uncombined.pdf", width = 10, height = 14)

# --- Figure A4 threat plot --- #
# Threat-dimension projections for the two FRUS sub-corpora, one horizontal
# rail per corpus (rails at y = .8 and y = 1.4, flipped by coord_flip).
frus_threat_rows <- subset(
  results_df,
  type == "threat" &
    embeddings %in% c("FRUS (Soviets, 1952-77)  ", "FRUS (Universe, 1964-66)  ")
)

# double-headed grey arrow drawn at a fixed height
frus_rail <- function(height) {
  geom_segment(
    y = height, yend = height, x = -.42, xend = .42,
    arrow = arrow(length = unit(0.4, "cm"), ends = "both", type = "closed"),
    size = 1.2, colour = "grey40"
  )
}

ggplot(frus_threat_rows, aes(colour = embeddings)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray60") +
  frus_rail(.8) +
  frus_rail(1.4) +
  # black disc underneath the coloured one gives each point an outline
  geom_point(aes(x = cos_sim, y = y_pos, group = embeddings),
             size = 5, colour = "black") +
  geom_point(aes(x = cos_sim, y = y_pos, group = embeddings, colour = embeddings),
             size = 4) +
  geom_text_repel(
    aes(x = cos_sim, y = y_pos, group = embeddings, label = name_labels),
    colour = "black", segment.colour = "gray50", segment.size = .4,
    nudge_y = 0.15, direction = "y", hjust = 0, size = 7, force = .2
  ) +
  scale_colour_manual(
    values = c("gray60", "gray90"),
    guide = guide_legend(reverse = FALSE)
  ) +
  theme_bw() +
  coord_flip(xlim = c(-.4, .4), ylim = c(.7, 1.8)) +
  guides(colour = guide_legend(override.aes = list(size = 10))) +
  scale_x_continuous(breaks = c(-.5, -.4, -.3, -.2, -.1, 0, .1, .2, .3)) +
  labs(x = "Threat Dimension") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 25, margin = margin(r = 24)),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_blank(),
    legend.title = element_blank(),
    axis.ticks.y = element_line(color = "black", size = .2),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    legend.position = "top",
    legend.spacing.x = unit(.4, 'cm'),
    legend.box.background = element_rect(colour = "gray30"),
    legend.text = element_text(size = 20),
    plot.margin = margin(r = 160, l = 40, t = 10, b = 10)
  )
#ggsave("threat_dimensions_frus_uncombined.pdf", width = 10, height = 14)

